In [1]:
import numpy as np
import pandas as pd
In [2]:
# Load the sequencing core's demultiplexing sheet (tab-separated).
info = pd.read_csv('../data/Demultiplex_Sheet.txt', delimiter='\t')
In [3]:
# Order rows by sample identifier so downstream output is deterministic.
info = info.sort_values(by='#SampleID')
info.head()
Out[3]:
In [4]:
# Keep a single row per (sample ID, reverse primer) pair, then persist the
# deduplicated sheet for the record.
dedup_keys = ['#SampleID', 'ReversePrimer']
info = info.drop_duplicates(subset=dedup_keys)
info.to_csv('unduplicated_demultiplex.txt', sep='\t', index=False)
In [5]:
# Quick visual check of the deduplicated sheet.
info.head()
Out[5]:
In [6]:
# Number of samples remaining after deduplication.
info.shape[0]
Out[6]:
In [7]:
# Build the prep-information table: one row per sample, with per-sample fields
# pulled from the demultiplex sheet and constant fields repeated n times.
n = info.shape[0]
ids = info.loc[:, '#SampleID']
data = {
    # .values drops the pandas index, keeping only the raw values.
    'Sample_Name': ids.str[7:13].values,   # sample code embedded in the ID
    'Barcode': info.loc[:, 'BarcodeSequence'].values,
    'LinkerPrimerSequence': info.loc[:, 'LinkerPrimerSequence'].values,
    'Description': info.loc[:, 'Description'].values,
    'Experiment_Design_Description': np.repeat(
        '16S stool samples sequenced for MrOS Vitamin D study', n),
    'Library_Construction_Protocol': np.repeat('16S rRNA v4', n),
    'Linker': np.repeat('GT', n),
    'Platform': np.repeat('Illumina', n),
    'Center_Name': ids.str[14:16].values,  # site code embedded in the ID
    'Center_Project': np.repeat('MrOS', n),
    'Instrument_Model': np.repeat('Illumina MiSeq', n),
}
# Dict insertion order already matches the desired column order.
prep_info = pd.DataFrame(data, columns=list(data))
In [8]:
# Sanity check: dimensions of the prep table.
prep_info.shape
Out[8]:
In [9]:
# Inspect the first ten prep rows.
prep_info.head(10)
Out[9]:
In [10]:
# Fix mismatched sample name: 'BIO778' (letter O) should be 'BI0778' (zero),
# per Nora's email on 08/07/2017. replace() with no column scope touches every
# cell, but this ID string only occurs in Sample_Name.
prep_info = prep_info.replace(to_replace='BIO778', value='BI0778')
In [2]:
# Load the two halves of the subject metadata (comma is pandas' default sep).
samples_part1 = pd.read_csv('../data/VitDMetadata_update.csv')
samples_part2 = pd.read_csv('../data/Other32metadata.csv')
In [3]:
# Row/column counts of the two metadata parts before merging.
print(samples_part1.shape)
print(samples_part2.shape)
In [4]:
# Confirm both parts share the same column layout before concatenating.
print(samples_part1.columns)
print(samples_part2.columns)
In [5]:
# Merge two metadata files: stack the parts row-wise with a fresh 0..m-1 index.
samples = pd.concat([samples_part1, samples_part2], ignore_index=True)
In [6]:
# Combined metadata dimensions.
samples.shape
Out[6]:
In [7]:
#samples.M1ANTIB.value_counts()
Out[7]:
In [16]:
# Count distinct subject IDs (should equal the row count if IDs are unique).
len(samples['ID'].unique())
Out[16]:
In [17]:
# Inspect the first ten metadata rows.
samples.head(10)
Out[17]:
In [18]:
# Study site for each subject (NOTE: this is re-assigned in the next cell).
sites = samples.loc[:, 'SITE']
In [19]:
# reference: http://www.latlong.net/
# Map each study site to its (latitude, longitude). Coordinates are kept as
# strings because they are written verbatim into the text metadata file.
SITE_COORDS = {
    'Birmingham':  ('33.520661', '-86.80249'),
    'San Diego':   ('32.715738', '-117.1611'),
    'Pittsburgh':  ('40.440625', '-79.99589'),
    'Palo Alto':   ('37.441883', '-122.143'),
    'Portland':    ('45.523062', '-122.6765'),
    'Minneapolis': ('44.977753', '-93.26501'),
}

Latitude = []
Longitude = []
sites = samples.loc[:, 'SITE']
for site in sites:
    try:
        latitude, longitude = SITE_COORDS[site]
    except KeyError:
        # The original if/elif chain silently reused the previous row's
        # coordinates for an unrecognized site (and crashed with NameError if
        # the first row was unknown). Fail loudly so bad data is caught early.
        raise ValueError('Unknown study site: {!r}'.format(site))
    Latitude.append(latitude)
    Longitude.append(longitude)
In [20]:
# Spot-check a handful of rows: site names should line up with the coordinates.
check_rows = [1, 90, 200, 300, 400, 500]
print(samples['SITE'][check_rows])
print(np.array(Latitude)[check_rows])
print(np.array(Longitude)[check_rows])
In [21]:
# Required sample-information fields for submission: per-sample values come
# from the metadata frame; study-wide constants are repeated for all m rows.
m = samples.shape[0]

def _fill(value):
    # Column holding the same value for every one of the m samples.
    return np.repeat(value, m)

required = {
    'Sample_Name': samples.loc[:, 'ID'].values,
    'Title': _fill('MrOS_VitaminD'),
    'Anonymized_Name': samples.loc[:, 'ID'].values,
    'Scientific_Name': _fill('human gut metagenome'),
    'Taxon_ID': _fill('Not applicable'),
    #'Description': _fill('Not applicable'),
    'Sample_Type': _fill('stool'),
    'Geo_Loc_Name': samples.loc[:, 'SITE'].values,
    'Elevation': _fill('Not applicable'),
    'Env_Biome': _fill('urban biome'),
    'Env_Feature': _fill('human-associated habitat'),
    'Env_Material': _fill('feces'),
    'Env_Package': _fill('human-gut'),
    'Latitude': Latitude,
    'Longitude': Longitude,
    'Collection_Timestamp': _fill('Not applicable'),
    'DNA_Extracted': _fill('Not applicable'),
    'Physical_Specimen_Location': _fill('Not applicable'),
    'Physical_Specimen_Remaining': _fill('Not applicable'),
    'Age': samples.loc[:, 'V4AGE1'].values,
    'Age_Units': _fill('years'),
    'Host_Subject_ID': _fill('Not applicable'),
    'Host_Taxid': _fill('Not applicable'),
    'Host_Scientific_Name': _fill('Homo sapiens'),
    'Host_Common_Name': _fill('human'),
    'Life_Stage': _fill('adult'),
    'Sex': _fill('male'),  # MrOS is an all-male cohort
    'Height': samples.loc[:, 'HWHGT'].values,
    'Height_Units': _fill('cm'),
    'Weight': samples.loc[:, 'HWWGT'].values,
    'Weight_Units': _fill('kg'),
    'BMI': samples.loc[:, 'HWBMI'].values,
    'Body_Habitat': _fill('UBERON:feces'),
    'Body_Site': _fill('UBERON:feces'),
    'Body_Product': _fill('UBERON:feces'),
}
In [22]:
# Attach the required fields to the raw metadata, then restrict/order the
# columns: required submission fields first, selected study variables last.
# ('Description' stays commented out, mirroring the `required` dict above.)
ordered_cols = [
    'Sample_Name', 'Title', 'Anonymized_Name', 'Scientific_Name', 'Taxon_ID',
    #'Description',
    'Sample_Type', 'Geo_Loc_Name', 'Elevation',
    'Env_Biome', 'Env_Feature', 'Env_Material', 'Env_Package',
    'Latitude', 'Longitude',
    'Collection_Timestamp', 'DNA_Extracted',
    'Physical_Specimen_Location', 'Physical_Specimen_Remaining',
    'Age', 'Age_Units', 'Host_Subject_ID', 'Host_Taxid',
    'Host_Scientific_Name', 'Host_Common_Name',
    'Life_Stage', 'Sex', 'Height', 'Height_Units', 'Weight', 'Weight_Units',
    'BMI', 'Body_Habitat', 'Body_Site', 'Body_Product',
    'GIERACE', 'SITE', 'TUDRAMT', 'PASCORE', 'TURSMOKE', 'DTVITD',
    'M1ADEPR', 'M1VITMND', 'M1ANTIB', 'M1PROBI', 'OHV1D3', 'OHV24D3',
    'OHVD3', 'OHVD2', 'OHV1D2', 'OHV1D2CT', 'OHVD2CT', 'OHVDTOT',
    'OHV1DTOT', 'OHSEAS', 'VDstatus',
]
sample_info = pd.concat([pd.DataFrame(required), samples], axis=1)
# reindex(columns=...) selects/orders columns exactly like
# pd.DataFrame(df, columns=...) did (NaN-fills any missing column).
sample_info = sample_info.reindex(columns=ordered_cols)
In [23]:
# Final sample-information dimensions.
sample_info.shape
Out[23]:
In [24]:
# Inspect the first ten sample-information rows.
sample_info.head(10)
Out[24]:
In [25]:
# Sample IDs present in the prep table vs. in the sample-metadata table.
id_prep = prep_info['Sample_Name'].tolist()
id_sample = sample_info['Sample_Name'].tolist()
In [26]:
# Samples with metadata yet no sequencing data.
# Use a set for O(1) membership tests instead of rescanning the id_prep list
# for every sample (the original loop was O(n*m)); print order is unchanged.
prep_ids = set(id_prep)
for sample_id in id_sample:
    if sample_id not in prep_ids:
        print(sample_id)
In [27]:
# Samples with sequencing data yet no metadata (order and duplicates kept).
no_sample_info = [sample_id for sample_id in id_prep
                  if sample_id not in id_sample]
In [28]:
# How many (and which) sequenced samples lack metadata.
print(len(no_sample_info))
print(no_sample_info)
In [29]:
# Exclude PO7100, which has no microbiome metadata (Lily Liu's email, 08/07/2017).
keep = ~prep_info['Sample_Name'].isin(no_sample_info)
prep_info = prep_info[keep]
In [30]:
# Prep-table dimensions after dropping unmatched samples.
prep_info.shape
Out[30]:
In [31]:
# Write the sample and prep tables; flag missing metadata values explicitly
# with the Qiita-style 'Missing:not collected' marker.
sample_info.to_csv('../data/sample_MrOS.txt', sep='\t',
                   na_rep='Missing:not collected', index=False)
prep_info.to_csv('../data/prep_MrOS.txt', sep='\t', index=False)
In [32]:
# Final look at the prep table before merging.
prep_info.head(5)
Out[32]:
In [33]:
# Inner join (pandas' default `how`) on Sample_Name: keeps only IDs present
# in both the prep and sample tables.
mapping = pd.merge(prep_info, sample_info, on='Sample_Name')
In [34]:
# Row counts before/after the merge (mapping should match prep_info's rows).
print(prep_info.shape)
print(sample_info.shape)
print(mapping.shape)
In [35]:
# Rename to QIIME mapping-file conventions. One rename call instead of two,
# and no inplace=True (inplace offers no benefit and hides state changes).
mapping = mapping.rename(columns={'Sample_Name': '#SampleID',
                                  'Barcode': 'BarcodeSequence'})
In [36]:
# Mapping-file dimensions.
mapping.shape
Out[36]:
In [37]:
# Inspect the renamed mapping table.
mapping.head(5)
Out[37]:
In [38]:
# QIIME expects 'Description' to be the last column; move it to the end.
reordered = [col for col in mapping.columns if col != 'Description']
reordered.append('Description')
mapping = mapping.loc[:, reordered]
In [39]:
# Verify the column order before writing the file.
mapping.head(5)
Out[39]:
In [40]:
# Write the final QIIME mapping file (tab-separated, no index column).
mapping.to_csv('../data/mapping_MrOS.txt', sep='\t', index=False)
In [ ]: